package org.apache.spark.ml.feature

import org.ansj.splitWord.analysis.ToAnalysis
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types.DataType

import scala.collection.JavaConversions._

/**
  * Created by peibin on 2017/5/8.
  */
class TianchiWeiboExtractor(override val uid: String)
  extends UnaryTransformer[String, Vector, TianchiWeiboExtractor] with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("MyExtractor"))

  /**
    * Maps the raw post text to a dense 4-element feature vector:
    *   0: number of segmented words (via ansj `ToAnalysis`)
    *   1: 1.0 if the text contains a `#topic#` tag, else 0.0
    *   2: 1.0 if the text contains a link prefix `"http://"`, else 0.0
    *   3: 1.0 if the text contains an `@` mention, else 0.0
    */
  override protected def createTransformFunc: (String) => Vector = x => {
    // Length of the post in words. Taking the Java list's size directly; the
    // previous map(_.getRealName).length did redundant per-term work (and
    // relied on the deprecated JavaConversions implicit) just to count terms.
    val wordLength = ToAnalysis.parse(x).getTerms.size.toDouble
    // BUG FIX: String.matches anchors the pattern to the WHOLE string, so the
    // original checks (e.g. x.matches("@")) fired only when the entire post
    // equalled the pattern. These features are meant to be "contains" checks.
    val topic = if ("#.*#".r.findFirstIn(x).isDefined) 1.0 else 0.0
    val link = if (x.contains("http://")) 1.0 else 0.0
    val at = if (x.contains("@")) 1.0 else 0.0

    Vectors.dense(wordLength, topic, link, at)
  }

  // Output column is an ML dense vector.
  override protected def outputDataType: DataType = new VectorUDT()
}
